Feature selection can be performed by training a large, unpruned random forest; the resulting feature importances can then be used to identify the most important features (Jaiswal and Samikannu 2017), leading to a reduction in computational requirements.
Biau and Scornet (2016) present an extensive exploration of random forests, covering aspects of theoretical foundations, model evaluation, and empirical performance. They discuss how tree construction, feature selection, and ensemble learning aid in predictive analysis.
The filtration method is a feature selection technique that ranks the characteristics of the data according to weight. Features whose weights are greater than a threshold are retained, helping to improve classification accuracy (Ren and Han 2017).
Boosting
South African Red List
Purpose
While there is a small number of threatened species with a large range, it is clear that Range is likely a strong predictor of Group status.
A lower range predicts a higher likelihood of threatened or extinct grouping.
# Bin Range into 20 quantile groups so Cramer's V can treat it as categorical.
corrDF <- train %>% mutate(Range = ntile(Range, n = 20))
# Drop the species identifier and convert the categorical predictors to
# factors. (The original listed "Range" twice in across() and contained a
# dangling duplicate `corrDF <-` assignment; both removed.)
corrDF <- corrDF %>%
  select(!GenSpec) %>%
  mutate(across(c("Group", "LF", "GF", "Range",
                  "Biomes", "Habitat_degradation",
                  "Habitat_loss", "IAS", "Other",
                  "Over_exploitation", "Pollution", "Unknown"),
                as_factor))
# Pairwise Cramer's V between all columns, rendered as a lower-triangle plot.
ggcorrplot::ggcorrplot(DescTools::PairApply(corrDF, DescTools::CramerV), type = 'lower')
Raw Factor
1 Parasitic 1
2 Tree 2
3 Suffrutex 3
# Re-create the factor version of the training data (Range binned into 20
# quantile groups) used for the correlation analysis.
corrDF <- train %>%
  mutate(Range = ntile(Range, n = 20))
# Convert categorical predictors to factors. ("Range" was listed twice in
# the original across() call; once is sufficient.)
corrDF <- corrDF %>%
  mutate(across(c(
    "Group", "LF", "GF", "Range",
    "Biomes",
    "Habitat_degradation",
    "Habitat_loss", "IAS", "Other",
    "Over_exploitation", "Pollution",
    "Unknown"), as_factor))
# Show the first three growth-form (GF) values before and after factor
# conversion, side by side.
printFactors <- matrix(
  c(train$GF[1], train$GF[2],
    train$GF[3], corrDF$GF[1],
    corrDF$GF[2], corrDF$GF[3]),
  nrow = 3)
colnames(printFactors) <- c('Raw', 'Factor')
rownames(printFactors) <- c('1', '2', '3')
print(printFactors, quote = FALSE)
| GenSpec | Group | LF | GF |
|---|---|---|---|
| 1020_4001 | NotThr | Perennial | Parasitic |
| 1023_1 | Thr | Perennial | Tree |
| 1028_9 | NotThr | Perennial | Suffrutex |
| 1029_1 | NotThr | Annual | Herb |
| 1031_125 | Thr | Perennial | Lithophyte |
| GenSpec | LF | GF | Fam |
|---|---|---|---|
| 1020.400 | 0 | 1 | 1 |
| 1023.100 | 0 | 2 | 1 |
| 1028.900 | 0 | 3 | 2 |
| 1029.100 | 1 | 4 | 2 |
| 1031.125 | 0 | 5 | 2 |
Group Counts Pre-Balancing: 490 148 23
Group Counts Post-Balancing: 490 490 490
# Balance the three Group classes by random oversampling (ROSE::ovun.sample).
# The majority class has 490 rows, so each pairwise subset is oversampled to
# N = 980 (490 per class) and the pieces are recombined.
AB <- data_train
AB <- AB[AB$label != '3', ]            # classes 1 and 2 only
AB_res <- ovun.sample(label ~ ., data = AB,
                      method = "over", N = 980,
                      seed = 1)$data
AC <- data_train
AC <- AC[AC$label != '2', ]            # classes 1 and 3 only
AC_res <- ovun.sample(label ~ ., data = AC,
                      method = "over", N = 980,
                      seed = 1)$data
AB_2 <- AB_res[AB_res$label == '2', ]            # oversampled class 2
AC_3 <- AC_res[AC_res$label == '3', ]            # oversampled class 3
data_train_1 <- AB_res[AB_res$label == '1', ]    # untouched majority class
data_train_combined <- rbind(data_train_1, AB_2, AC_3)
cat("Group Counts Pre-Balancing: ",
    table(data_train$label),
    "\nGroup Counts Post-Balancing: ",
    table(data_train_combined$label))
\(X_{new}=\frac{X_{old}-\min(X_{old})}{\max(X_{old})-\min(X_{old})}\)
# Min-max normalisation: rescale every feature to [0, 1].
# NOTE(review): the test set is scaled with its OWN min/max rather than the
# training set's — confirm this is intended; it puts train and test on
# slightly different scales and can leak information.
trainMM <- as.data.frame(lapply(features_train,
                                function(x) {(x - min(x)) / (max(x) - min(x))}))
testMM <- as.data.frame(lapply(features_test,
                               function(x) {(x - min(x)) / (max(x) - min(x))}))
train <- trainMM
train$label <- label
MMCounts <- table(train$label)
# Random forest on the normalised features; drop the label column from x.
# (Original indexed by ncol(data_train_combined); ncol(train) is the column
# actually being removed and is identical whenever the frames line up.)
MM <- randomForest(x = train[-ncol(train)],
                   y = as.factor(train$label), ntree = 4)
importanceMM <- importance(MM)
PredMM <- predict(MM, testMM)
accuracy <- sum(label_test == PredMM) / length(label_test)
testfactorMM <- as.factor(label_test)
PredFactorMM <- as.factor(PredMM)
cm <- confusionMatrix(PredFactorMM, testfactorMM)
rownames(cm$byClass) <- c("NotThr", "Thr", "Ext")
# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["NotThr", "Sensitivity"],
                 cm$byClass["Thr", "Sensitivity"],
                 cm$byClass["Ext", "Sensitivity"]))
precision <- mean(c(cm$byClass["NotThr", "Pos Pred Value"],
                    cm$byClass["Thr", "Pos Pred Value"],
                    cm$byClass["Ext", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall',
                          'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}-\bar{X}_{old}}{\sigma_{X_{old}}}\)
# Z-score standardisation: centre each feature and scale to unit variance.
# NOTE(review): test features are standardised with the test set's own
# mean/sd, not the training set's — confirm intended (see min-max chunk).
TrainZS <- as.data.frame(lapply(features_train,
                                function(x) {(x - mean(x)) / sd(x)}))
TestZS <- as.data.frame(lapply(features_test,
                               function(x) {(x - mean(x)) / sd(x)}))
train <- TrainZS
train$label <- label
ZSCounts <- table(train$label)
# Drop the label column from x (ncol(train), not a different frame's width).
ZS <- randomForest(x = train[-ncol(train)],
                   y = as.factor(train$label), ntree = 4)
ImportanceZS <- importance(ZS)
PredZS <- predict(ZS, TestZS)
accuracy <- sum(label_test == PredZS) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorZS <- as.factor(PredZS)
cm <- confusionMatrix(PredFactorZS, label_test_factor)
# Macro-averaged metrics; caret names the rows "Class: 1" .. "Class: 3".
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\max(|X_{old}|)}\)
# Max-abs scaling: divide each feature by its largest absolute value,
# mapping it into [-1, 1].
# NOTE(review): test features use the test set's own max(abs(x)) — confirm
# intended (see min-max chunk).
TrainMAV <- as.data.frame(lapply(features_train,
                                 function(x) {x / max(abs(x))}))
TestMAV <- as.data.frame(lapply(features_test,
                                function(x) {x / max(abs(x))}))
train <- TrainMAV
train$label <- label
MAVCounts <- table(train$label)
# Drop the label column from x (ncol(train), not a different frame's width).
MAV <- randomForest(x = train[-ncol(train)],
                    y = as.factor(train$label),
                    ntree = 4)
ImportanceMAV <- importance(MAV)
PredMAV <- predict(MAV, TestMAV)
accuracy <- sum(label_test == PredMAV) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorMAV <- as.factor(PredMAV)
cm <- confusionMatrix(PredFactorMAV, label_test_factor)
# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\sum(|X_{old}|)}\)
# L1 normalisation: divide each feature by the sum of its absolute values.
# NOTE(review): test features use the test set's own sum — confirm intended
# (see min-max chunk).
trainL1 <- as.data.frame(lapply(features_train,
                                function(x) {x / sum(abs(x))}))
TestL1 <- as.data.frame(lapply(features_test,
                               function(x) {x / sum(abs(x))}))
train <- trainL1
train$label <- label
L1Counts <- table(train$label)
# Drop the label column from x (ncol(train), not a different frame's width).
L1 <- randomForest(x = train[-ncol(train)],
                   y = as.factor(train$label), ntree = 4)
L1Importance <- importance(L1)
PredL1 <- predict(L1, TestL1)
accuracy <- sum(label_test == PredL1) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorL1 <- as.factor(PredL1)
cm <- confusionMatrix(PredFactorL1, label_test_factor)
# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\sqrt{\sum{X_{old}^2}}}\)
# L2 normalisation: divide each feature by its Euclidean norm.
# NOTE(review): test features use the test set's own norm — confirm intended
# (see min-max chunk).
TrainL2 <- as.data.frame(lapply(features_train,
                                function(x) {x / sqrt(sum(x^2))}))
TestL2 <- as.data.frame(lapply(features_test,
                               function(x) {x / sqrt(sum(x^2))}))
train <- TrainL2
train$label <- label
L2Counts <- table(train$label)
# Drop the label column from x (ncol(train), not a different frame's width).
L2 <- randomForest(x = train[-ncol(train)],
                   y = as.factor(train$label), ntree = 4)
L2Importance <- importance(L2)
PredL2 <- predict(L2, TestL2)
accuracy <- sum(label_test == PredL2) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorL2 <- as.factor(PredL2)
cm <- confusionMatrix(PredFactorL2, label_test_factor)
# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
          Score
Accuracy 0.92
Recall 0.82
Precision 0.88
F1 0.85
# Majority-vote ensemble: each observation receives the class predicted most
# often by the five normalisation-specific forests.
n <- length(PredMM)
final_pred <- rep(NA, n)
for (i in seq_len(n)) {     # seq_len is safe for n == 0, unlike 1:n
  preds <- c(PredMM[i], PredZS[i], PredMAV[i],
             PredL1[i], PredL2[i])
  # table() counts votes; which.max picks the first modal class on ties.
  final_pred[i] <- as.numeric(names(which.max(table(preds))))
}
accuracy <- sum(label_test == final_pred) / length(label_test)
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)
rownames(cm_vote$byClass) <- c("NotThr", "Thr", "Ext")
# Macro-averaged recall and precision over the three classes.
sensitivity_class1 <- cm_vote$byClass["NotThr", "Sensitivity"]
sensitivity_class2 <- cm_vote$byClass["Thr", "Sensitivity"]
sensitivity_class3 <- cm_vote$byClass["Ext", "Sensitivity"]
recall <- (sensitivity_class1 + sensitivity_class2 + sensitivity_class3) / 3
precision_class1 <- cm_vote$byClass["NotThr", "Pos Pred Value"]
precision_class2 <- cm_vote$byClass["Thr", "Pos Pred Value"]
precision_class3 <- cm_vote$byClass["Ext", "Pos Pred Value"]
precision <- (precision_class1 + precision_class2 + precision_class3) / 3
F1 <- 2 * recall * precision / (recall + precision)
\(\text{Accuracy}=\frac{\sum{\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\)
\(\text{Recall}=\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}\)
\(\text{Precision}=\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\)
\(\text{F1}=\frac{2*(\text{Precision}*\text{Recall})}{\text{Precision}+\text{Recall}}\)
Score
Accuracy 0.92
Recall 0.82
Precision 0.88
F1 0.85
# Majority-vote ensemble (repeated, this time printing the results table).
n <- length(PredMM)
final_pred <- rep(NA, n)
for (i in seq_len(n)) {     # seq_len is safe for n == 0, unlike 1:n
  preds <- c(PredMM[i], PredZS[i], PredMAV[i],
             PredL1[i], PredL2[i])
  # table() counts votes; which.max picks the first modal class on ties.
  final_pred[i] <- as.numeric(names(which.max(table(preds))))
}
accuracy <- sum(label_test == final_pred) / length(label_test)
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)
rownames(cm_vote$byClass) <- c("NotThr", "Thr", "Ext")
# Macro-averaged recall and precision over the three classes.
sensitivity_class1 <- cm_vote$byClass["NotThr", "Sensitivity"]
sensitivity_class2 <- cm_vote$byClass["Thr", "Sensitivity"]
sensitivity_class3 <- cm_vote$byClass["Ext", "Sensitivity"]
recall <- (sensitivity_class1 + sensitivity_class2 + sensitivity_class3) / 3
precision_class1 <- cm_vote$byClass["NotThr", "Pos Pred Value"]
precision_class2 <- cm_vote$byClass["Thr", "Pos Pred Value"]
precision_class3 <- cm_vote$byClass["Ext", "Pos Pred Value"]
precision <- (precision_class1 + precision_class2 + precision_class3) / 3
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2),
                       round(recall, 2),
                       round(precision, 2),
                       round(F1, 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall',
                          'Precision', 'F1')
print(printTable)
          Score
Accuracy 0.92
Accuracy p-value <.001
95% CI (0.88,0.94)
Kappa 0.77
NotThr Thr Ext
Sensitivity 0.99 0.68 0.80
Specificity 0.75 0.98 0.99
Pos Pred Value 0.92 0.91 0.80
Neg Pred Value 0.96 0.92 0.99
Precision 0.92 0.91 0.80
Recall 0.99 0.68 0.80
F1 0.95 0.78 0.80
Prevalence 0.74 0.22 0.04
Detection Rate 0.73 0.15 0.03
Detection Prevalence 0.80 0.17 0.04
Balanced Accuracy 0.87 0.83 0.90
# Plot the ensemble confusion matrix as a heat map beside a table of the
# overall statistics, then print overall and per-class metric tables.
# NOTE(review): cm_vote and cm are built from identical calls; cm_vote
# appears unused below — confirm whether the duplicate is needed.
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)
cm <- confusionMatrix(final_pred_factor, label_test_factor)
cm_d <- as.data.frame(cm$table)      # long-format cell counts
cm_st <- data.frame(cm$overall)      # overall accuracy / kappa statistics
cm_st$cm.overall <- round(cm_st$cm.overall, 2)
cm_d$diag <- cm_d$Prediction == cm_d$Reference     # correctly classified cells
cm_d$ndiag <- cm_d$Prediction != cm_d$Reference    # misclassified cells
cm_d[cm_d == 0] <- NA                # blank out zero counts in the plot
cm_d$Reference <- reverse.levels(cm_d$Reference)   # put diagonal top-left to bottom-right
# Sign the frequencies so off-diagonal (misclassified) cells fall on the
# negative side of the diverging fill scale.
cm_d$ref_freq <- cm_d$Freq * ifelse(is.na(cm_d$diag), -1, 1)
plt1 <- ggplot(data = cm_d, aes(x = Prediction, y = Reference,
                                fill = Freq)) +
  scale_x_discrete(position = "top") +
  geom_tile(data = cm_d, aes(fill = ref_freq)) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_gradient2(guide = "none", low = "red", high = "mediumvioletred",
                       mid = "mistyrose",
                       midpoint = 0, na.value = 'white') +
  geom_text(aes(label = Freq), color = 'black', size = 3) +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position = "none",
        panel.border = element_blank(),
        plot.background = element_blank(),
        axis.line = element_blank())
plt2 <- tableGrob(cm_st)
grid.arrange(plt1, plt2, nrow = 1, ncol = 2,
             top = textGrob("Confusion Matrix", gp = gpar(fontsize = 25, font = 1)))
# Overall statistics table; p-values below 0.001 are printed as "<.001".
printTable <- matrix(c(round(cm$overall['Accuracy'], 2),
                       if (cm$overall['AccuracyPValue'] < 0.001) {"<.001"}
                       else round(cm$overall['AccuracyPValue'], 3),
                       paste0("(", round(cm$overall['AccuracyLower'], 2),
                              ",", round(cm$overall['AccuracyUpper'], 2),
                              ")"),
                       round(cm$overall['Kappa'], 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Accuracy p-value', '95% CI', 'Kappa')
print(printTable, quote = FALSE)
# Per-class metrics, relabelled with the domain class names.
cmBC <- cm$byClass
rownames(cmBC) <- c("NotThr", "Thr", "Ext")
print(t(round(cmBC, 2)), quote = FALSE)